take home exercise 2 Singapore public bus commuter flows

Author

Xu Lin

#overview We currently have data on people’s movement patterns as well as information on schools, businesses, and retail activities in different areas. The goal of this task is to identify common Saturday activities among the population. Through our analysis, we aim to provide recommendations to the government, suggesting potential enhancements to existing facilities or proposing the development of new amenities that align with people’s preferences. The objective is to make these facilities more appealing and strategically located for the community’s convenience.

#Objective Our goal is to pinpoint popular weekend destinations, analyze the main facilities in those areas, and provide recommendations accordingly.

#Data Geospatial data: Passenger Volume by Origin Destination Bus Stops, Bus Stop Location, Train Station and Train Station Exit Point, Master Plan 2019 Subzone Boundary, HDB Property Information, Business, Entertn, F&B, FinServ, Leisure&Recreation and Retails. Aspatial data: HDB Property Information. This data is for us to use.

#Import the data

pacman::p_load(tmap, sf, DT, stplanr, sp, dplyr,
               performance, reshape2, units, 
               ggpubr, tidyverse, mapview, httr, sfheaders, knitr, kableExtra)

#Importing the OD data

odbus <- read_csv("data/aspatial/origin_destination_bus_202310.csv")
Rows: 5694297 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): YEAR_MONTH, DAY_TYPE, PT_TYPE, ORIGIN_PT_CODE, DESTINATION_PT_CODE
dbl (2): TIME_PER_HOUR, TOTAL_TRIPS

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
odbus$ORIGIN_PT_CODE <- as.factor(odbus$ORIGIN_PT_CODE)
odbus$DESTINATION_PT_CODE <- as.factor(odbus$DESTINATION_PT_CODE) 
glimpse(odbus)
Rows: 5,694,297
Columns: 7
$ YEAR_MONTH          <chr> "2023-10", "2023-10", "2023-10", "2023-10", "2023-…
$ DAY_TYPE            <chr> "WEEKENDS/HOLIDAY", "WEEKDAY", "WEEKENDS/HOLIDAY",…
$ TIME_PER_HOUR       <dbl> 16, 16, 14, 14, 17, 17, 17, 7, 14, 14, 10, 20, 20,…
$ PT_TYPE             <chr> "BUS", "BUS", "BUS", "BUS", "BUS", "BUS", "BUS", "…
$ ORIGIN_PT_CODE      <fct> 04168, 04168, 80119, 80119, 44069, 20281, 20281, 1…
$ DESTINATION_PT_CODE <fct> 10051, 10051, 90079, 90079, 17229, 20141, 20141, 1…
$ TOTAL_TRIPS         <dbl> 3, 5, 3, 5, 4, 1, 24, 2, 1, 7, 3, 2, 5, 1, 1, 1, 1…
weekendmorning11_14 <- odbus %>%
  filter(DAY_TYPE == "WEEKENDS/HOLIDAY") %>%
  filter(TIME_PER_HOUR >= 11 & TIME_PER_HOUR <= 14) %>%
  group_by(ORIGIN_PT_CODE, DESTINATION_PT_CODE) %>%
  summarise(TRIPS = sum(TOTAL_TRIPS), .groups = 'keep')
datatable(weekendmorning11_14)
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
write_rds(weekendmorning11_14, "data/rds/weekendmorning11_14.rds")
weekendmorning11_14 <- read_rds("data/rds/weekendmorning11_14.rds")

#Working with Geospatial Data

busstop <- st_read(dsn = "data/geospatial", layer = "BusStop") %>%
  st_transform(crs = 3414)
Reading layer `BusStop' from data source 
  `/Users/linxu/ISSS624/take home exercise 2/data/geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 5161 features and 3 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 3970.122 ymin: 26482.1 xmax: 48284.56 ymax: 52983.82
Projected CRS: SVY21
glimpse(busstop)
Rows: 5,161
Columns: 4
$ BUS_STOP_N <chr> "22069", "32071", "44331", "96081", "11561", "66191", "2338…
$ BUS_ROOF_N <chr> "B06", "B23", "B01", "B05", "B05", "B03", "B02A", "B02", "B…
$ LOC_DESC   <chr> "OPP CEVA LOGISTICS", "AFT TRACK 13", "BLK 239", "GRACE IND…
$ geometry   <POINT [m]> POINT (13576.31 32883.65), POINT (13228.59 44206.38),…
busstop_points = busstop %>%
  st_as_sf(coords = c("geometry"), crs = 3414, remove = FALSE)
mapview_busstop_points = mapview(busstop_points, cex = 0.5, alpha = .5, popup = NULL)
mapview_busstop_points
area_honeycomb_grid = st_make_grid(busstop_points, c(375, 375), what = "polygons", square = FALSE)
honeycomb_grid_sf = st_sf(area_honeycomb_grid) %>%
  mutate(grid_id = 1:length(lengths(area_honeycomb_grid)))
honeycomb_grid_sf$n_colli = lengths(st_intersects(honeycomb_grid_sf, busstop_points))
honeycomb_count = filter(honeycomb_grid_sf, n_colli > 0)
map_honeycomb = tm_shape(honeycomb_count) +
  tm_fill(
    col = "n_colli",
    palette = "Reds",
    style = "cont",
    title = "Number of collisions",
    id = "grid_id",
    showNA = FALSE,
    alpha = 0.6,
    popup.vars = c(
      "Number of collisions: " = "n_colli"
    ),
    popup.format = list(
      n_colli = list(format = "f", digits = 0)
    )
  ) +
  tm_borders(col = "grey40", lwd = 0.7)
map_honeycomb

tmap_mode("plot")
tmap mode set to plotting
busstop_honeycomb_count <- st_intersection(busstop, honeycomb_count) %>%
  select(BUS_STOP_N, grid_id) %>%
  st_drop_geometry()
Warning: attribute variables are assumed to be spatially constant throughout
all geometries
glimpse(busstop_honeycomb_count)
Rows: 5,162
Columns: 2
$ BUS_STOP_N <chr> "25059", "25059", "25751", "26379", "26369", "25761", "2638…
$ grid_id    <int> 3, 86, 170, 173, 174, 211, 214, 255, 255, 258, 295, 296, 29…
write_rds(busstop_honeycomb_count, "data/rds/busstop_honeycomb_count.rds")  
weekendmorning11_14 <- left_join(weekendmorning11_14 , busstop_honeycomb_count,
            by = c("ORIGIN_PT_CODE" = "BUS_STOP_N")) %>%
  rename(ORIGIN_BS = ORIGIN_PT_CODE,
         ORIGIN_SZ = grid_id,
         DESTIN_BS = DESTINATION_PT_CODE)
Warning in left_join(weekendmorning11_14, busstop_honeycomb_count, by = c(ORIGIN_PT_CODE = "BUS_STOP_N")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 27736 of `x` matches multiple rows in `y`.
ℹ Row 3165 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
duplicate <- weekendmorning11_14 %>%
  group_by_all() %>%
  filter(n()>1) %>%
  ungroup()
weekendmorning11_14 <- unique(weekendmorning11_14)
weekendmorning11_14 <- left_join(weekendmorning11_14 , busstop_honeycomb_count,
            by = c("DESTIN_BS" = "BUS_STOP_N")) 
Warning in left_join(weekendmorning11_14, busstop_honeycomb_count, by = c(DESTIN_BS = "BUS_STOP_N")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 206 of `x` matches multiple rows in `y`.
ℹ Row 3203 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
duplicate <- weekendmorning11_14 %>%
  group_by_all() %>%
  filter(n()>1) %>%
  ungroup()
weekendmorning11_14 <- unique(weekendmorning11_14)
weekendmorning11_14 <- weekendmorning11_14 %>%
  rename(DESTIN_SZ = grid_id) %>%
  drop_na() %>%
  group_by(ORIGIN_SZ, DESTIN_SZ) %>%
  summarise(WEEKENDDAYMORNING_PEAK = sum(TRIPS))
`summarise()` has grouped output by 'ORIGIN_SZ'. You can override using the
`.groups` argument.
write_rds(weekendmorning11_14, "data/rds/weekendmorning11_14.rds")
weekendmorning11_14 <- read_rds("data/rds/weekendmorning11_14.rds")
weekendmorning11_14_1 <- weekendmorning11_14[weekendmorning11_14$ORIGIN_SZ!=weekendmorning11_14$DESTIN_SZ,]
flowLine <- od2line(flow = weekendmorning11_14_1, 
                    zones = honeycomb_count,
                    zone_code = "grid_id")
Creating centroids representing desire line start and end points.
tm_shape(honeycomb_count) +
  tm_polygons() +
flowLine %>%  
tm_shape() +
  tm_lines(lwd = "WEEKENDDAYMORNING_PEAK",
           style = "quantile",
           scale = c(0.1, 1, 3, 5, 7, 10),
           n = 6,
           alpha = 0.3)
Warning in g$scale * (w_legend/maxW): longer object length is not a multiple of
shorter object length
Warning in g$scale * (x/maxW): longer object length is not a multiple of
shorter object length

tm_shape(honeycomb_count) +
  tm_polygons() +
flowLine %>%  
  filter(WEEKENDDAYMORNING_PEAK >= 2000) %>%
tm_shape() +
  tm_lines(lwd = "WEEKENDDAYMORNING_PEAK",
           style = "quantile",
           scale = c(0.1, 1, 3, 5, 7, 10),
           n = 6,
           alpha = 0.3)
Warning in g$scale * (w_legend/maxW): longer object length is not a multiple of
shorter object length
Warning in g$scale * (x/maxW): longer object length is not a multiple of
shorter object length

Caculate the distance

honeycomb_count <- read_rds("data/rds/honeycomb_count.rds")
honeycomb_count
Simple feature collection with 2176 features and 2 fields
Geometry type: POLYGON
Dimension:     XY
Bounding box:  xmin: 3595.122 ymin: 26265.59 xmax: 48595.12 ymax: 53328.89
Projected CRS: SVY21 / Singapore TM
First 10 features:
              area_honeycomb_grid grid_id n_colli
1  POLYGON ((3782.622 27889.39...       3       1
2  POLYGON ((4157.622 27889.39...      86       1
3  POLYGON ((4532.622 28538.91...     170       1
4  POLYGON ((4532.622 30487.47...     173       1
5  POLYGON ((4532.622 31136.99...     174       1
6  POLYGON ((4720.122 28214.15...     211       1
7  POLYGON ((4720.122 30162.71...     214       1
8  POLYGON ((4907.622 29837.95...     255       2
9  POLYGON ((4907.622 31786.51...     258       1
10 POLYGON ((5095.122 28863.67...     295       1
honeycomb_count_sp <- as(honeycomb_count, "Spatial")
honeycomb_count_sp
class       : SpatialPolygonsDataFrame 
features    : 2176 
extent      : 3595.122, 48595.12, 26265.59, 53328.89  (xmin, xmax, ymin, ymax)
crs         : +proj=tmerc +lat_0=1.36666666666667 +lon_0=103.833333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs 
variables   : 2
names       : grid_id, n_colli 
min values  :       3,       1 
max values  :    9888,      10 
dist <- spDists(honeycomb_count_sp, 
                longlat = FALSE)
head(dist, n=c(10, 10))
           [,1]      [,2]      [,3]      [,4]      [,5]      [,6]      [,7]
 [1,]    0.0000  375.0000  992.1567 2704.1635 3333.0729  992.1567 2459.0394
 [2,]  375.0000    0.0000  750.0000 2625.0000 3269.1742  649.5191 2341.8742
 [3,]  992.1567  750.0000    0.0000 1948.5572 2598.0762  375.0000 1634.5871
 [4,] 2704.1635 2625.0000 1948.5572    0.0000  649.5191 2281.0359  375.0000
 [5,] 3333.0729 3269.1742 2598.0762  649.5191    0.0000 2928.8436  992.1567
 [6,]  992.1567  649.5191  375.0000 2281.0359 2928.8436    0.0000 1948.5572
 [7,] 2459.0394 2341.8742 1634.5871  375.0000  992.1567 1948.5572    0.0000
 [8,] 2250.0000 2087.9116 1352.0817  750.0000 1352.0817 1634.5871  375.0000
 [9,] 4056.2452 3968.6270 3269.1742 1352.0817  750.0000 3577.2720 1634.5871
[10,] 1634.5871 1352.0817  649.5191 1718.4659 2341.8742  750.0000 1352.0817
           [,8]     [,9]     [,10]
 [1,] 2250.0000 4056.245 1634.5871
 [2,] 2087.9116 3968.627 1352.0817
 [3,] 1352.0817 3269.174  649.5191
 [4,]  750.0000 1352.082 1718.4659
 [5,] 1352.0817  750.000 2341.8742
 [6,] 1634.5871 3577.272  750.0000
 [7,]  375.0000 1634.587 1352.0817
 [8,]    0.0000 1948.557  992.1567
 [9,] 1948.5572    0.000 2928.8436
[10,]  992.1567 2928.844    0.0000
sz_names <- honeycomb_count$grid_id
colnames(dist) <- paste0(sz_names)
rownames(dist) <- paste0(sz_names)
distPair <- melt(dist) %>%
  rename(dist = value)
head(distPair, 10)
   Var1 Var2      dist
1     3    3    0.0000
2    86    3  375.0000
3   170    3  992.1567
4   173    3 2704.1635
5   174    3 3333.0729
6   211    3  992.1567
7   214    3 2459.0394
8   255    3 2250.0000
9   258    3 4056.2452
10  295    3 1634.5871
distPair %>%
  filter(dist > 0) %>%
  summary()
      Var1           Var2           dist      
 Min.   :   3   Min.   :   3   Min.   :  375  
 1st Qu.:3470   1st Qu.:3470   1st Qu.: 7902  
 Median :5300   Median :5300   Median :12898  
 Mean   :5037   Mean   :5037   Mean   :13632  
 3rd Qu.:6656   3rd Qu.:6656   3rd Qu.:18283  
 Max.   :9888   Max.   :9888   Max.   :44926  
distPair$dist <- ifelse(distPair$dist == 0,
                        50, distPair$dist)
distPair %>%
  summary()
      Var1           Var2           dist      
 Min.   :   3   Min.   :   3   Min.   :   50  
 1st Qu.:3470   1st Qu.:3470   1st Qu.: 7902  
 Median :5300   Median :5300   Median :12898  
 Mean   :5037   Mean   :5037   Mean   :13625  
 3rd Qu.:6656   3rd Qu.:6656   3rd Qu.:18283  
 Max.   :9888   Max.   :9888   Max.   :44926  
distPair <- distPair %>%
  rename(orig = Var1,
         dest = Var2)
write_rds(distPair, "data/rds/distPair.rds") 
weekendmorning11_14 <- read_rds("data/rds/weekendmorning11_14.rds")
flow_data <- weekendmorning11_14 %>%
  group_by(ORIGIN_SZ, DESTIN_SZ) %>% 
  summarize(TRIPS = sum(WEEKENDDAYMORNING_PEAK)) 
`summarise()` has grouped output by 'ORIGIN_SZ'. You can override using the
`.groups` argument.
head(flow_data)
# A tibble: 6 × 3
# Groups:   ORIGIN_SZ [2]
  ORIGIN_SZ DESTIN_SZ TRIPS
      <int>     <int> <dbl>
1       170       377     4
2       170       462     2
3       170       594    20
4       173       301     1
5       173       342     5
6       173       554     3
flow_data$FlowNoIntra <- ifelse(
  flow_data$ORIGIN_SZ == flow_data$DESTIN_SZ, 
  0, flow_data$TRIPS)
flow_data$offset <- ifelse(
  flow_data$ORIGIN_SZ == flow_data$DESTIN_SZ, 
  0.000001, 1)
flow_data$ORIGIN_SZ <- as.integer(flow_data$ORIGIN_SZ)
flow_data$DESTIN_SZ <- as.integer(flow_data$DESTIN_SZ)
flow_data1 <- flow_data %>%
  left_join (distPair,
             by = c("ORIGIN_SZ" = "orig",
                    "DESTIN_SZ" = "dest"))
flow_data1 <- flow_data1 %>%
  left_join(honeycomb_count,
            by = c("DESTIN_SZ" = "grid_id")) %>%
  rename(DIST = dist)
summary(flow_data1)
   ORIGIN_SZ      DESTIN_SZ        TRIPS           FlowNoIntra      
 Min.   : 170   Min.   : 170   Min.   :    1.00   Min.   :    0.00  
 1st Qu.:4569   1st Qu.:4573   1st Qu.:    2.00   1st Qu.:    2.00  
 Median :5699   Median :5698   Median :    8.00   Median :    7.00  
 Mean   :5623   Mean   :5616   Mean   :   46.69   Mean   :   46.51  
 3rd Qu.:6775   3rd Qu.:6743   3rd Qu.:   26.00   3rd Qu.:   26.00  
 Max.   :9888   Max.   :9888   Max.   :43417.00   Max.   :43417.00  
     offset              DIST          area_honeycomb_grid    n_colli     
 Min.   :0.000001   Min.   :   50   POLYGON      :161094   Min.   : 1.00  
 1st Qu.:1.000000   1st Qu.: 2088   epsg:3414    :     0   1st Qu.: 2.00  
 Median :1.000000   Median : 4226   +proj=tmer...:     0   Median : 3.00  
 Mean   :0.995934   Mean   : 5213                          Mean   : 2.89  
 3rd Qu.:1.000000   3rd Qu.: 7377                          3rd Qu.: 4.00  
 Max.   :1.000000   Max.   :24827                          Max.   :10.00  
write_rds(flow_data1,
          "data/rds/flow_data1.rds")

Make flow data with weekend activies

url<-"https://www.onemap.gov.sg/api/common/elastic/search"

csv<-read_csv("data/aspatial/Generalinformationofschools.csv")
Rows: 346 Columns: 31
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (31): school_name, url_address, address, postal_code, telephone_no, tele...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
postcodes<-csv$`postal_code`

found<-data.frame()
not_found<-data.frame()

for(postcode in postcodes){
  query<-list('searchVal'=postcode,'returnGeom'='Y','getAddrDetails'='Y','pageNum'='1')
  res<- GET(url,query=query)
  
  if((content(res)$found)!=0){
    found<-rbind(found,data.frame(content(res))[4:13])
  } else{
    not_found = data.frame(postcode)
  }
}
merged = merge(csv, found, by.x = 'postal_code', by.y = 'results.POSTAL', all = TRUE)
write.csv(merged, file = "data/aspatial/schools.csv")
write.csv(not_found, file = "data/aspatial/not_found.csv")
schools <- read_csv("data/aspatial/schools.csv") %>%
  rename(latitude = "results.LATITUDE",
         longitude = "results.LONGITUDE")%>%
  select(postal_code, school_name, latitude, longitude)
New names:
Rows: 350 Columns: 41
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(36): postal_code, school_name, url_address, address, telephone_no, tele... dbl
(5): ...1, results.X, results.Y, results.LATITUDE, results.LONGITUDE
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
schools <- schools[complete.cases(schools$longitude, schools$latitude), ]
schools_sf <- st_as_sf(schools, 
                       coords = c("longitude", "latitude"),
                       crs=4326) %>%
  st_transform(crs = 3414)
tmap_options(check.and.fix = TRUE)
tm_shape(honeycomb_count) +
  tm_polygons() +
tm_shape(schools_sf) +
  tm_dots()

tmap_mode("plot")
tmap mode set to plotting
honeycomb_count$`SCHOOL_COUNT`<- lengths(
  st_intersects(
   honeycomb_count, schools_sf))
summary(honeycomb_count$SCHOOL_COUNT)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.1333  0.0000  4.0000 

##公交站转换

RTSS_sf <- st_read(dsn = "data/geospatial/",
                layer = "RapidTransitSystemStation")%>%
  st_transform(crs = 3414)
Reading layer `RapidTransitSystemStation' from data source 
  `/Users/linxu/ISSS624/take home exercise 2/data/geospatial' 
  using driver `ESRI Shapefile'
Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, : GDAL
Message 1: Non closed ring detected. To avoid accepting it, set the
OGR_GEOMETRY_ACCEPT_UNCLOSED_RING configuration option to NO
Simple feature collection with 220 features and 4 fields
Geometry type: POLYGON
Dimension:     XY
Bounding box:  xmin: 6068.209 ymin: 27478.44 xmax: 45377.5 ymax: 47913.58
Projected CRS: SVY21
RTSS_sf_polygons <- RTSS_sf[st_is_valid(RTSS_sf), ]
RTSS_sf_points <- st_centroid(RTSS_sf_polygons)
Warning: st_centroid assumes attributes are constant over geometries
tmap_options(check.and.fix = TRUE)
tm_shape(honeycomb_count) +
  tm_polygons() +
tm_shape(RTSS_sf_points) +
  tm_dots()
Warning: The shape RTSS_sf_points contains empty units.

tmap_mode("plot")
tmap mode set to plotting
honeycomb_count$`RTSS_COUNT`<- lengths(
  st_intersects(
    honeycomb_count, RTSS_sf_points))
summary(honeycomb_count$`RTSS_COUNT`)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.00000 0.00000 0.08823 0.00000 4.00000 

entertn

entertn_sf <- st_read(dsn = "data/geospatial/",
                layer = "entertn")%>%
  st_transform(crs = 3414)
Reading layer `entertn' from data source 
  `/Users/linxu/ISSS624/take home exercise 2/data/geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 114 features and 3 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 10809.34 ymin: 26528.63 xmax: 41600.62 ymax: 46375.77
Projected CRS: SVY21 / Singapore TM
tmap_options(check.and.fix = TRUE)
tm_shape(honeycomb_count) +
  tm_polygons() +
tm_shape(entertn_sf) +
  tm_dots()

tmap_mode("plot")
tmap mode set to plotting
honeycomb_count$`ENTERTN_COUNT`<- lengths(
  st_intersects(
    honeycomb_count, entertn_sf))
summary(honeycomb_count$ENTERTN_COUNT)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.0455  0.0000  7.0000 

Liesure&Recreation

lr_sf <- st_read(dsn = "data/geospatial/",
                                    layer = "Liesure&Recreation") %>%
  st_transform(crs = 3414)
Reading layer `Liesure&Recreation' from data source 
  `/Users/linxu/ISSS624/take home exercise 2/data/geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 1217 features and 30 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 6010.495 ymin: 25134.28 xmax: 48439.77 ymax: 50078.88
Projected CRS: SVY21 / Singapore TM
tmap_options(check.and.fix = TRUE)
tm_shape(honeycomb_count) +
  tm_polygons() +
tm_shape(lr_sf) +
  tm_dots()

tmap_mode("plot")
tmap mode set to plotting
honeycomb_count$`LR_COUNT`<- lengths(
  st_intersects(
    honeycomb_count, lr_sf))
summary(honeycomb_count$LR_COUNT)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.4007  0.0000 23.0000 

##Retails

retails_sf <- st_read(dsn = "data/geospatial/",
                layer = "Retails")%>%
  st_transform(crs = 3414)
Reading layer `Retails' from data source 
  `/Users/linxu/ISSS624/take home exercise 2/data/geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 37635 features and 3 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 4737.982 ymin: 25171.88 xmax: 48265.04 ymax: 50135.28
Projected CRS: SVY21 / Singapore TM
tmap_options(check.and.fix = TRUE)
tm_shape(honeycomb_count) +
  tm_polygons() +
tm_shape(retails_sf) +
  tm_dots()

tmap_mode("plot")
tmap mode set to plotting
honeycomb_count$`RETAILS_COUNT`<- lengths(
  st_intersects(
    honeycomb_count, retails_sf))
summary(honeycomb_count$RETAILS_COUNT)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    0.00    2.00   15.42    9.00  987.00 

#数据整合

honeycomb_count_tidy <- honeycomb_count %>%
  st_drop_geometry() %>%
  select(grid_id, SCHOOL_COUNT, RTSS_COUNT, ENTERTN_COUNT, LR_COUNT, RETAILS_COUNT)
flow_data1 <- flow_data1 %>%
  left_join(honeycomb_count_tidy,
            by = c("DESTIN_SZ" = "grid_id")) 
summary(flow_data1)
   ORIGIN_SZ      DESTIN_SZ        TRIPS           FlowNoIntra      
 Min.   : 170   Min.   : 170   Min.   :    1.00   Min.   :    0.00  
 1st Qu.:4569   1st Qu.:4573   1st Qu.:    2.00   1st Qu.:    2.00  
 Median :5699   Median :5698   Median :    8.00   Median :    7.00  
 Mean   :5623   Mean   :5616   Mean   :   46.69   Mean   :   46.51  
 3rd Qu.:6775   3rd Qu.:6743   3rd Qu.:   26.00   3rd Qu.:   26.00  
 Max.   :9888   Max.   :9888   Max.   :43417.00   Max.   :43417.00  
     offset              DIST          area_honeycomb_grid    n_colli     
 Min.   :0.000001   Min.   :   50   POLYGON      :161094   Min.   : 1.00  
 1st Qu.:1.000000   1st Qu.: 2088   epsg:3414    :     0   1st Qu.: 2.00  
 Median :1.000000   Median : 4226   +proj=tmer...:     0   Median : 3.00  
 Mean   :0.995934   Mean   : 5213                          Mean   : 2.89  
 3rd Qu.:1.000000   3rd Qu.: 7377                          3rd Qu.: 4.00  
 Max.   :1.000000   Max.   :24827                          Max.   :10.00  
  SCHOOL_COUNT      RTSS_COUNT     ENTERTN_COUNT       LR_COUNT      
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.0000  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 0.0000  
 Median :0.0000   Median :0.0000   Median :0.0000   Median : 0.0000  
 Mean   :0.1399   Mean   :0.1862   Mean   :0.1138   Mean   : 0.7081  
 3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.: 1.0000  
 Max.   :4.0000   Max.   :4.0000   Max.   :7.0000   Max.   :23.0000  
 RETAILS_COUNT   
 Min.   :  0.00  
 1st Qu.:  1.00  
 Median :  6.00  
 Mean   : 35.52  
 3rd Qu.: 26.00  
 Max.   :987.00  
flow_data1$SCHOOL_COUNT <- ifelse(
  flow_data1$SCHOOL_COUNT == 0,
  0.99, flow_data1$SCHOOL_COUNT)
flow_data1$RTSS_COUNT <- ifelse(
  flow_data1$RTSS_COUNT == 0,
  0.99, flow_data1$RTSS_COUNT)
flow_data1$ENTERTN_COUNT <- ifelse(
  flow_data1$ENTERTN_COUNT == 0,
  0.99, flow_data1$ENTERTN_COUNT)
flow_data1$LR_COUNT <- ifelse(
  flow_data1$LR_COUNT == 0,
  0.99, flow_data1$LR_COUNT)
flow_data1$RETAILS_COUNT <- ifelse(
  flow_data1$RETAILS_COUNT == 0,
  0.99, flow_data1$RETAILS_COUNT)
summary(flow_data1)
   ORIGIN_SZ      DESTIN_SZ        TRIPS           FlowNoIntra      
 Min.   : 170   Min.   : 170   Min.   :    1.00   Min.   :    0.00  
 1st Qu.:4569   1st Qu.:4573   1st Qu.:    2.00   1st Qu.:    2.00  
 Median :5699   Median :5698   Median :    8.00   Median :    7.00  
 Mean   :5623   Mean   :5616   Mean   :   46.69   Mean   :   46.51  
 3rd Qu.:6775   3rd Qu.:6743   3rd Qu.:   26.00   3rd Qu.:   26.00  
 Max.   :9888   Max.   :9888   Max.   :43417.00   Max.   :43417.00  
     offset              DIST          area_honeycomb_grid    n_colli     
 Min.   :0.000001   Min.   :   50   POLYGON      :161094   Min.   : 1.00  
 1st Qu.:1.000000   1st Qu.: 2088   epsg:3414    :     0   1st Qu.: 2.00  
 Median :1.000000   Median : 4226   +proj=tmer...:     0   Median : 3.00  
 Mean   :0.995934   Mean   : 5213                          Mean   : 2.89  
 3rd Qu.:1.000000   3rd Qu.: 7377                          3rd Qu.: 4.00  
 Max.   :1.000000   Max.   :24827                          Max.   :10.00  
  SCHOOL_COUNT     RTSS_COUNT    ENTERTN_COUNT      LR_COUNT     
 Min.   :0.990   Min.   :0.990   Min.   :0.990   Min.   : 0.990  
 1st Qu.:0.990   1st Qu.:0.990   1st Qu.:0.990   1st Qu.: 0.990  
 Median :0.990   Median :0.990   Median :0.990   Median : 0.990  
 Mean   :1.007   Mean   :1.021   Mean   :1.035   Mean   : 1.407  
 3rd Qu.:0.990   3rd Qu.:0.990   3rd Qu.:0.990   3rd Qu.: 1.000  
 Max.   :4.000   Max.   :4.000   Max.   :7.000   Max.   :23.000  
 RETAILS_COUNT   
 Min.   :  0.99  
 1st Qu.:  1.00  
 Median :  6.00  
 Mean   : 35.70  
 3rd Qu.: 26.00  
 Max.   :987.00  
write_rds(flow_data1,
          "data/rds/flow_data_tidy.rds")

##SIM

flow_data2 <- read_rds("data/rds/flow_data_tidy.rds")
flow_data2 <- flow_data2 %>%
  mutate(ORIGIN_SZ = as.character(ORIGIN_SZ), DESTIN_SZ = as.character(DESTIN_SZ))
glimpse(flow_data2)
Rows: 161,094
Columns: 13
Groups: ORIGIN_SZ [2,137]
$ ORIGIN_SZ           <chr> "170", "170", "170", "173", "173", "173", "173", "…
$ DESTIN_SZ           <chr> "377", "462", "594", "301", "342", "554", "555", "…
$ TRIPS               <dbl> 4, 2, 20, 1, 5, 3, 31, 12, 1, 1, 9, 42, 23, 4, 14,…
$ FlowNoIntra         <dbl> 4, 2, 20, 1, 5, 3, 31, 12, 1, 1, 9, 42, 23, 4, 14,…
$ offset              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ DIST                <dbl> 992.1567, 1634.5871, 6139.0146, 2341.8742, 2087.91…
$ area_honeycomb_grid <POLYGON [m]> POLYGON ((5470.122 28214.15..., POLYGON ((…
$ n_colli             <int> 1, 1, 5, 2, 2, 4, 4, 6, 1, 1, 4, 4, 6, 2, 2, 1, 1,…
$ SCHOOL_COUNT        <dbl> 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.…
$ RTSS_COUNT          <dbl> 0.99, 0.99, 1.00, 0.99, 0.99, 0.99, 0.99, 1.00, 0.…
$ ENTERTN_COUNT       <dbl> 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.…
$ LR_COUNT            <dbl> 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.…
$ RETAILS_COUNT       <dbl> 0.99, 0.99, 2.00, 0.99, 0.99, 0.99, 0.99, 9.00, 1.…
kable(head(flow_data2[, 1:5], n = 5))
ORIGIN_SZ DESTIN_SZ TRIPS FlowNoIntra offset
170 377 4 4 1
170 462 2 2 1
170 594 20 20 1
173 301 1 1 1
173 342 5 5 1
flow_data2$FlowNoIntra <- ifelse(
  flow_data2$ORIGIN_SZ == flow_data2$DESTIN_SZ, 
  0, flow_data2$TRIPS)
flow_data2$offset <- ifelse(
  flow_data2$ORIGIN_SZ == flow_data2$DESTIN_SZ, 
  0.000001, 1)
inter_zonal_flow <- flow_data2 %>%
  filter(FlowNoIntra > 0)